# Loading packages
library(readtext)
library(readxl)
library(dplyr)
library(tidyverse)
library(tidytext)
library(tm)
library(textstem)
library(wordcloud)
library(slam)
library(topicmodels)
library(SentimentAnalysis)

Introduction

(…)

Background

(…)

What is the Federal Open Market Committee?

The Federal Open Market Committee (FOMC) is a body of the central bank of the United States (the Federal Reserve System). Its main duty is setting the national monetary policy. The FOMC makes all decisions regarding the federal funds rate, the size and composition of the Federal Reserve’s asset holdings, and communications with the public about the likely future course of monetary policy. The FOMC consists of 12 voting members: seven members of the Board of Governors, the president of the Federal Reserve Bank of New York, and four of the remaining 11 Reserve Bank presidents, who serve one-year terms on a rotating basis. All 12 of the Reserve Bank presidents attend FOMC meetings and participate in FOMC discussions, but only the presidents who are Committee members at the time may vote on policy decisions. FOMC meetings typically are held eight times each year in Washington, D.C., and at other times as needed.

How are statements organized?

Immediately after each FOMC meeting, (…)

Statistical analysis

(…)

Data description

(…) Statements were scraped from https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm (scraping algorithm described separately).

Text preparation

(…)

# Loading scraped statements (from 2006 to 2018)
# DATA_DIR <- "C:/Users/KAndr/OneDrive/Studia/II rok I semestr/Text mining/Text mining project/Statements/"
DATA_DIR <- "C:/Users/esobolewska/Documents/FOMC-text-mining/Statements"

# Read every yearly folder in one pass and bind the results; this replaces
# thirteen copy-pasted readtext() calls plus a manual rbind()/remove().
statements <- do.call(
  rbind,
  lapply(2006:2018, function(yr) readtext(paste0(DATA_DIR, "/", yr, "/*")))
)

# Initial preprocessing: sequential document ID, friendlier column names,
# and the meeting date parsed from the file name (e.g. "20060131.txt").
statements <- statements %>% mutate(ID = row_number())
colnames(statements) <- c("Date", "Text", "ID")
# fixed = TRUE strips the literal ".txt" suffix ("." is a regex wildcard otherwise)
statements$Date <- gsub(".txt", "", statements$Date, fixed = TRUE)
statements$Date <- as.Date(statements$Date, "%Y%m%d")
statements_all <- as.vector(statements$Text)
length(statements_all) # 107 documents
## [1] 107
# Converting documents into a corpus
(corpus_all <- VCorpus(VectorSource(statements_all)))
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 107
inspect(corpus_all[[1]])
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 778
## 
## The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/2 percent. Although recent economic data have been uneven, the expansion in economic activity appears solid. Core inflation has stayed relatively low in recent months and longer-term inflation expectations remain contained. Nevertheless, possible increases in resource utilization as well as elevated energy prices have the potential to add to inflation pressures. The Committee judges that some further policy firming may be needed to keep the risks to the attainment of both sustainable economic growth and price stability roughly in balance. In any event, the Committee will respond to changes in economic prospects as needed to foster these objectives.
as.character(corpus_all[[1]]) 
## [1] "The Federal Open Market Committee decided today to raise its target for the federal funds rate by 25 basis points to 4-1/2 percent. Although recent economic data have been uneven, the expansion in economic activity appears solid. Core inflation has stayed relatively low in recent months and longer-term inflation expectations remain contained. Nevertheless, possible increases in resource utilization as well as elevated energy prices have the potential to add to inflation pressures. The Committee judges that some further policy firming may be needed to keep the risks to the attainment of both sustainable economic growth and price stability roughly in balance. In any event, the Committee will respond to changes in economic prospects as needed to foster these objectives. "

Preprocessing

# English stop-word list from tm
stopwords <- stopwords("en")
# Clean the corpus in one pipeline. tolower() is a base function, so it must
# be wrapped in content_transformer(); the original bare tm_map(tolower)
# silently degraded each document to a plain character vector and then had
# to be repaired with tm_map(PlainTextDocument), discarding document metadata.
system.time(
  corpus_clean <- corpus_all %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, stopwords) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(stripWhitespace)
)
# example statement after cleaning
as.character(corpus_clean[[1]]) 
## [1] " federal open market committee decided today raise target federal funds rate basis points percent although recent economic data uneven expansion economic activity appears solid core inflation stayed relatively low recent months longerterm inflation expectations remain contained nevertheless possible increases resource utilization well elevated energy prices potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives "
# example statement after cleaning
as.character(corpus_clean[[2]]) 
## [1] " federal open market committee decided today raise target federal funds rate basis points percent slowing growth real gdp fourth quarter seems largely reflected temporary special factors economic growth rebounded strongly current quarter appears likely moderate sustainable pace yet run prices energy commodities appears modest effect core inflation ongoing productivity gains helped hold growth unit labor costs check inflation expectations remain contained still possible increases resource utilization combination elevated prices energy commodities potential add inflation pressures committee judges policy firming may needed keep risks attainment sustainable economic growth price stability roughly balance event committee will respond changes economic prospects needed foster objectives"
# Flatten the cleaned corpus back into a data frame aligned row-for-row
# with `statements`
df_corpus <- data.frame(
  text = unlist(lapply(corpus_clean, as.character)),
  stringsAsFactors = FALSE
)
df_corpus <- df_corpus %>% mutate(doc_id = row_number())
statements_clean <- statements %>%
  mutate(cleaned_text = df_corpus$text)

# Tokens per document (cleaned text). count(ID) directly gives the number of
# tokens per document; the original count() + group_by() + summarize(sum(n))
# computed the same thing in three redundant steps.
count_cleaned_word <- statements_clean %>%
  unnest_tokens(word, cleaned_text) %>%
  count(ID, name = "word_cleaned_count")

statements_clean_count <- left_join(statements_clean, count_cleaned_word, by = "ID")

# Tokens per document (raw text), same simplification as above
count_word <- statements_clean_count %>%
  unnest_tokens(word, Text) %>%
  count(ID, name = "word_count")

statements_final <- left_join(statements_clean_count, count_word, by = "ID")

Word counts over time

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive line chart of statement length (raw word count) over time
myplot <- statements_final %>%
  select(Date, word_count) %>%
  ggplot(aes(x = Date, y = word_count)) +
  geom_line() +
  theme_minimal()
ggplotly(myplot)

Tf-idf analysis

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
# Yearly word frequencies from the raw statement text.
# NOTE(review): the token column is (confusingly) named `word_count`; the name
# is kept in case later chunks reference this data frame.
statements_words <- statements_clean_count %>%
  mutate(year = year(Date)) %>%
  unnest_tokens(word_count, Text) %>%
  count(year, word_count, sort = TRUE)

statements_words
## # A tibble: 4,789 x 3
##     year word_count     n
##    <dbl> <chr>      <int>
##  1  2014 the          411
##  2  2013 the          344
##  3  2015 the          288
##  4  2016 the          281
##  5  2017 the          266
##  6  2011 the          236
##  7  2012 the          221
##  8  2014 and          202
##  9  2014 of           199
## 10  2013 and          195
## # ... with 4,779 more rows
statements_words <- statements_words %>%
  bind_tf_idf(word_count, year, n)

statements_words
## # A tibble: 4,789 x 6
##     year word_count     n     tf   idf tf_idf
##    <dbl> <chr>      <int>  <dbl> <dbl>  <dbl>
##  1  2014 the          411 0.0696     0      0
##  2  2013 the          344 0.0657     0      0
##  3  2015 the          288 0.0674     0      0
##  4  2016 the          281 0.0675     0      0
##  5  2017 the          266 0.0696     0      0
##  6  2011 the          236 0.0685     0      0
##  7  2012 the          221 0.0638     0      0
##  8  2014 and          202 0.0342     0      0
##  9  2014 of           199 0.0337     0      0
## 10  2013 and          195 0.0372     0      0
## # ... with 4,779 more rows
# Top-10 terms per year by tf-idf.
# BUG FIX: top_n(10) with no ranking variable orders by the LAST column --
# here the `word` factor added by mutate() (the "## Selecting by word"
# message in the original output confirms it) -- so the plot showed
# alphabetically extreme words, not the highest-tf-idf words. The ranking
# variable must be passed explicitly.
pd <- statements_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word_count, levels = rev(unique(word_count)))) %>%
  group_by(year) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  arrange(year, tf_idf) %>%
  mutate(order = row_number())
ggplot(pd, aes(order, tf_idf, fill = year)) +
  geom_bar(show.legend = FALSE, stat = "identity") +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~year, ncol = 3, scales = "free") +
  scale_x_continuous(
    breaks = pd$order,
    labels = pd$word,
    expand = c(0, 0)) +
  coord_flip()

Wordclouds

library(wordcloud)
# Total frequency of every term across the whole cleaned corpus,
# sorted from most to least frequent
dtm <- TermDocumentMatrix(corpus_clean)
term_totals <- sort(rowSums(as.matrix(dtm)), decreasing = TRUE)
d <- data.frame(word = names(term_totals), freq = term_totals)
head(d, 10)
##                  word freq
## committee   committee  915
## inflation   inflation  834
## will             will  638
## economic     economic  556
## market         market  446
## federal       federal  445
## rate             rate  418
## labor           labor  331
## conditions conditions  326
## securities securities  314
# Fixed seed so the cloud layout is reproducible
set.seed(1234)
wordcloud(
  words = d$word, freq = d$freq, min.freq = 1,
  max.words = 50, random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

Associations and network analysis

Sentiment analysis

# Lemmatization: collapse each cleaned statement to lemma forms
statements_final$lemma_text <- lemmatize_strings(statements_final$cleaned_text)
# Tokenization: one row per (statement, lemma)
tokens <- statements_final %>%
  unnest_tokens(word, lemma_text)

Topic modelling

# Topic modelling.
# NOTE(review): the original author flagged this section as needing
# improvement ("do poprawy na pewno, bo slabo wychodzi" -- "definitely needs
# fixing, the results come out weak").

# install.packages("topicmodels")

# Document-term matrix (rows = documents, columns = terms).
# NOTE: despite the `tdm` name this is a DocumentTermMatrix, not a
# TermDocumentMatrix; the name is kept so downstream code keeps working.
tdm <- DocumentTermMatrix(corpus_clean)

# Mean tf-idf per term, used to prune uninformative terms before fitting
term_tfidf <- tapply(tdm$v/row_sums(tdm)[tdm$i], tdm$j, mean) * log2(nDocs(tdm)/col_sums(tdm > 0))
summary(term_tfidf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.01091 0.01977 0.02410 0.02952 0.16749
# Keep only reasonably informative terms, then drop documents left empty
tdm <- tdm[, term_tfidf >= 0.05]
tdm <- tdm[row_sums(tdm) > 0, ]
summary(col_sums(tdm))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    1.00    2.33    2.00   36.00
# Exploring the number of topics: fit LDA for k = 2..50 and collect the
# log-likelihoods (inspect best.model.logLik to guide the choice of k)
best.model <- lapply(seq(2, 50, by = 1), function(d) LDA(tdm, d))
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik)))
# Fitting the final models with four estimation methods
k <- 5      # number of topics
SEED <- 112 # RNG seed for reproducibility (original comment wrongly called
            # this "number of documents")
CSC_TM <- list(
  VEM = LDA(tdm, k = k,
            control = list(seed = SEED)),
  VEM_fixed = LDA(tdm, k = k,
                  control = list(estimate.alpha = FALSE, seed = SEED)),
  Gibbs = LDA(tdm, k = k, method = "Gibbs",
              control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)),
  CTM = CTM(tdm, k = k,
            control = list(seed = SEED,
                           var = list(tol = 10^-4),
                           em = list(tol = 10^-3)))
)

# Estimated/fixed alpha of the two VEM fits
sapply(CSC_TM[1:2], slot, "alpha")
##         VEM   VEM_fixed 
##  0.02486956 10.00000000
# Mean of sum(z * log(z)) over each document's topic distribution
# (negative entropy: values nearer 0 mean more concentrated topics)
sapply(CSC_TM, function(x) mean(apply(posterior(x)$topics, 1, function(z) sum(z*log(z)))))
##        VEM  VEM_fixed      Gibbs        CTM 
## -0.2206000 -1.5846064 -1.5947876 -0.5421278
# Most likely topic per document and the top-8 terms per topic (VEM fit)
Topic <- topics(CSC_TM[["VEM"]], 1)
Terms <- terms(CSC_TM[["VEM"]], 8)
Terms
##      Topic 1             Topic 2           Topic 3        Topic 4             
## [1,] "demandnonetheless" "tslf"            "bank"         "deepening"         
## [2,] "impetus"           "growthinflation" "central"      "mitigate"          
## [3,] "arise"             "spurred"         "arrangements" "carefullytodayâ\200\231s"
## [4,] "carefullyâ"        "consult"         "swiss"        "remainâ"           
## [5,] "closed"            "dealers"         "european"     "timely"            
## [6,] "forestall"         "percentstrains"  "funding"      "central"           
## [7,] "funding"           "pricesreadings"  "dollar"       "cuts"              
## [8,] "operations"        "softening"       "ecb"          "downturnâ"         
##      Topic 5       
## [1,] "yet"         
## [2,] "fail"        
## [3,] "predominant" 
## [4,] "adjustment"  
## [5,] "moderation"  
## [6,] "check"       
## [7,] "combination" 
## [8,] "convincingly"